## 
## Friday October 23rd
## Day 3: “Data Exploration, Part 2: Basic RNA-seq Plots”
##

# before session:
# have 'ggplot2', 'pheatmap', 'RColorBrewer', 'reshape2' installed

# code:
# install.packages("ggplot2")


## Step 1: call ("activate") libraries (packages) we are going to use
library(ggplot2)

## Step 2: set working directory to source file (this script) location
getwd()
## [1] "/Users/florschlamp/Desktop/Day 3"
# Session > Set Working Directory > To Source File Location


## Step 3: read in data files
# counts and metadata
load("StartData.Rdata")

head(raw_counts_filt)
##                   C1   T1   C2   T2   C3    T3   C4   T4
## ENSG00000000003  723  486  904  445 1170  1097  806  604
## ENSG00000000419  467  523  616  371  582   781  417  509
## ENSG00000000457  347  258  364  237  318   447  330  324
## ENSG00000000460   96   81   73   66  118    94  102   74
## ENSG00000000938    0    0    1    0    2     0    0    0
## ENSG00000000971 3413 3916 6000 4308 6424 10723 5039 7803
sample_table
##           id     dex celltype sample_name
## 1 SRR1039508 control   N61311          C1
## 2 SRR1039509 treated   N61311          T1
## 3 SRR1039512 control  N052611          C2
## 4 SRR1039513 treated  N052611          T2
## 5 SRR1039516 control  N080611          C3
## 6 SRR1039517 treated  N080611          T3
## 7 SRR1039520 control  N061011          C4
## 8 SRR1039521 treated  N061011          T4
## Step 4: normalize counts by library size and log scale
lib_sizes=colSums(raw_counts_filt)
size.factor=lib_sizes/exp(mean(log(lib_sizes)))
norm_counts=t(t(raw_counts_filt)/size.factor)
head(norm_counts)
##                        C1        T1           C2         T2          C3
## ENSG00000000003  757.0258  554.9712  767.9000260  628.24187 1031.494629
## ENSG00000000419  488.9780  597.2221  523.2593097  523.77019  513.102457
## ENSG00000000457  363.3305  294.6144  309.1986830  334.59174  280.354950
## ENSG00000000460  100.5180   92.4952   62.0096260   93.17745  104.031082
## ENSG00000000938    0.0000    0.0000    0.8494469    0.00000    1.763239
## ENSG00000000971 3573.6226 4471.7434 5096.6815883 6081.94603 5663.522647
##                         T3        C4         T4
## ENSG00000000003  763.20953  906.0354  610.94038
## ENSG00000000419  543.36066  468.7553  514.84876
## ENSG00000000457  310.98875  370.9574  327.72298
## ENSG00000000460   65.39808  114.6596   74.85031
## ENSG00000000938    0.00000    0.0000    0.00000
## ENSG00000000971 7460.25140 5664.4075 7892.66184
norm_log_counts = log2(norm_counts+1)
head(norm_log_counts)
##                        C1        T1         C2        T2        C3        T3
## ENSG00000000003  9.566103  9.118866  9.5866522  9.297471 10.011919  9.577824
## ENSG00000000419  8.936573  9.224537  9.0341368  9.035542  9.005912  9.088419
## ENSG00000000457  8.509104  8.207573  8.2770488  8.390563  8.136248  8.285350
## ENSG00000000460  6.665591  6.546820  5.9775003  6.557310  6.714673  6.053070
## ENSG00000000938  0.000000  0.000000  0.8870939  0.000000  1.466360  0.000000
## ENSG00000000971 11.803575 12.126944 12.3156255 12.570554 12.467739 12.865202
##                        C4        T4
## ENSG00000000003  9.825015  9.257247
## ENSG00000000419  8.875766  9.010804
## ENSG00000000457  8.538994  8.360729
## ENSG00000000460  6.853741  6.245083
## ENSG00000000938  0.000000  0.000000
## ENSG00000000971 12.467964 12.946479
# Plot 1) comparing replicates
# pairwise scatterplot

norm_log_counts <- data.frame(norm_log_counts)

## base plot
# basic
plot(norm_log_counts$C1,norm_log_counts$C2)

## ggplot
# basic
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() 

# add line
ggplot(norm_log_counts, aes(x=C1, y=C2)) + 
  geom_point() + geom_abline(slope=1, color="red")

# add better labels
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() + geom_abline(slope=1, color="red") +
  labs(title="pairwise scatterplot between samples C1 and C2",
       x="norm counts for sample C1",
       y="norm counts for sample C2")

# themes
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() + theme_bw()

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() + theme_classic()

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() + theme_grey() # default?

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point() + theme_minimal()

# manipulate points
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25) 

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25, color="blue") 

# filtered counts
norm_log_counts_filt <- read.csv("normalized_counts_after_log_and_filtering.csv", row.names = 1)

ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25, color="blue") 

# let's add some stats
rsq <- cor(norm_log_counts_filt$C1,norm_log_counts_filt$C2) ^ 2
rsq
## [1] 0.8649412
ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
  labs(title="pairwise scatterplot between samples C1 and C2",
       subtitle="R squared = 0.8649412",
       x="norm counts for sample C1",
       y="norm counts for sample C2")

ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
  labs(title="pairwise scatterplot between samples C1 and C2",
       subtitle=paste0("R squared = ",rsq),
       x="norm counts for sample C1",
       y="norm counts for sample C2")

round(rsq,digits=3)
## [1] 0.865
# final form:
ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
  geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
  labs(title="pairwise scatterplot between samples C1 and C2",
       subtitle=paste0("R squared = ",round(rsq,digits=3)),
       x="norm counts for sample C1",
       y="norm counts for sample C2")

## Any questions so far?



cor(norm_log_counts_filt$C1,norm_log_counts_filt$C2) ^ 2
## [1] 0.8649412
cor(norm_log_counts_filt$C1,norm_log_counts_filt$T1) ^ 2
## [1] 0.913021
# Plot 2) Comparing multiple samples
# heatmap of sample distances
library(pheatmap)

# calculate distances
sampleDists <- dist(t(norm_log_counts_filt))
sampleDistMatrix <- as.matrix(sampleDists)

# simple heatmap
pheatmap(sampleDistMatrix)

# change colors manually
library(RColorBrewer)
## for more colors: https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3

colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(50)

pheatmap(sampleDistMatrix,
         col=colors)

# add group labels
sample_table
##           id     dex celltype sample_name
## 1 SRR1039508 control   N61311          C1
## 2 SRR1039509 treated   N61311          T1
## 3 SRR1039512 control  N052611          C2
## 4 SRR1039513 treated  N052611          T2
## 5 SRR1039516 control  N080611          C3
## 6 SRR1039517 treated  N080611          T3
## 7 SRR1039520 control  N061011          C4
## 8 SRR1039521 treated  N061011          T4
rownames(sampleDistMatrix)
## [1] "C1" "T1" "C2" "T2" "C3" "T3" "C4" "T4"
annotation <- data.frame(row.names = sample_table$sample_name, #HAS to be the same as matrix
                         cell_type = sample_table$celltype,
                         treatment = sample_table$dex)

annotation
##    cell_type treatment
## C1    N61311   control
## T1    N61311   treated
## C2   N052611   control
## T2   N052611   treated
## C3   N080611   control
## T3   N080611   treated
## C4   N061011   control
## T4   N061011   treated
pheatmap(sampleDistMatrix,
         col=colors,
         annotation_col = annotation)

## show examples from other datasets



# Plot 3) Plotting genes of interest
# A) plot individual genes

gene_of_interest <- "ENSG00000179094"

norm_counts_subset <- norm_log_counts[gene_of_interest,]
norm_counts_subset
##                       C1       T1       C2       T2       C3       T3       C4
## ENSG00000179094 7.475356 10.63531 7.273795 10.25692 7.846876 9.997636 7.516538
##                       T4
## ENSG00000179094 10.33049
data_to_plot <- data.frame(row.names = sample_table$sample_name,
                           cell_type = sample_table$celltype,
                           treatment = sample_table$dex,
                           norm_counts = as.numeric(norm_counts_subset))

data_to_plot
##    cell_type treatment norm_counts
## C1    N61311   control    7.475356
## T1    N61311   treated   10.635307
## C2   N052611   control    7.273795
## T2   N052611   treated   10.256918
## C3   N080611   control    7.846876
## T3   N080611   treated    9.997636
## C4   N061011   control    7.516538
## T4   N061011   treated   10.330487
# basic plot
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_point()

ggplot(data_to_plot, aes(x=cell_type, y=norm_counts)) +
  geom_point()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_point(size=3,aes(color=cell_type))

# add lines
ggplot(data_to_plot, aes(x=treatment, y=norm_counts, group=cell_type)) +
  geom_point(size=3,aes(color=cell_type)) +
  geom_line()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts, group=cell_type)) +
  geom_point(size=3,aes(color=cell_type)) +
  geom_line(aes(color=cell_type))

# boxplots instead
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_boxplot()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_boxplot(aes(fill=treatment))

# add points
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_boxplot() + geom_point()

# add labels
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
  geom_boxplot(aes(fill=treatment)) + geom_point() +
  labs(title=gene_of_interest)

# B) plot multiple genes
genes_of_interest <- c("ENSG00000152583",
                       "ENSG00000179094",
                       "ENSG00000116584",
                       "ENSG00000189221",
                       "ENSG00000120129",
                       "ENSG00000148175",
                       "ENSG00000178695",
                       "ENSG00000109906",
                       "ENSG00000134686",
                       "ENSG00000101347")

norm_counts_subset <- norm_log_counts[genes_of_interest,]
norm_counts_subset
##                        C1        T1        C2        T2        C3        T3
## ENSG00000152583  5.923209 11.089208  6.832592 10.645621  6.677878 10.367184
## ENSG00000179094  7.475356 10.635307  7.273795 10.256918  7.846876  9.997636
## ENSG00000116584 11.674119 10.566361 11.603869 10.423378 11.648400 10.508935
## ENSG00000189221  9.017501 12.186827  8.823067 12.059141  9.064100 12.135343
## ENSG00000120129  9.467130 12.515039  9.998428 12.690950  9.460522 12.090248
## ENSG00000148175 12.840301 14.322178 12.860022 14.001677 12.922685 14.191447
## ENSG00000178695 12.316632  9.941118 11.999167  9.254766 12.184040  9.915834
## ENSG00000109906  3.761790  9.875987  3.780489  9.479676  2.842242  8.804000
## ENSG00000134686 10.785697 12.012384 10.544461 12.002685 10.839301 12.002460
## ENSG00000101347 10.868827 14.424318 10.923317 14.938754 10.635319 13.704838
##                        C4       T4
## ENSG00000152583  6.470551 10.99161
## ENSG00000179094  7.516538 10.33049
## ENSG00000116584 11.584999 10.54297
## ENSG00000189221  8.062936 11.78962
## ENSG00000120129  9.617708 12.76383
## ENSG00000148175 12.910653 14.37836
## ENSG00000178695 12.296746  9.28792
## ENSG00000109906  4.863088 10.32481
## ENSG00000134686 10.504055 12.10096
## ENSG00000101347 11.085331 15.13546
# flip the table
t(norm_counts_subset)
##    ENSG00000152583 ENSG00000179094 ENSG00000116584 ENSG00000189221
## C1        5.923209        7.475356        11.67412        9.017501
## T1       11.089208       10.635307        10.56636       12.186827
## C2        6.832592        7.273795        11.60387        8.823067
## T2       10.645621       10.256918        10.42338       12.059141
## C3        6.677878        7.846876        11.64840        9.064100
## T3       10.367184        9.997636        10.50893       12.135343
## C4        6.470551        7.516538        11.58500        8.062936
## T4       10.991606       10.330487        10.54297       11.789618
##    ENSG00000120129 ENSG00000148175 ENSG00000178695 ENSG00000109906
## C1        9.467130        12.84030       12.316632        3.761790
## T1       12.515039        14.32218        9.941118        9.875987
## C2        9.998428        12.86002       11.999167        3.780489
## T2       12.690950        14.00168        9.254766        9.479676
## C3        9.460522        12.92268       12.184040        2.842242
## T3       12.090248        14.19145        9.915834        8.804000
## C4        9.617708        12.91065       12.296746        4.863088
## T4       12.763834        14.37836        9.287920       10.324809
##    ENSG00000134686 ENSG00000101347
## C1        10.78570        10.86883
## T1        12.01238        14.42432
## C2        10.54446        10.92332
## T2        12.00269        14.93875
## C3        10.83930        10.63532
## T3        12.00246        13.70484
## C4        10.50405        11.08533
## T4        12.10096        15.13546
sample_table
##           id     dex celltype sample_name
## 1 SRR1039508 control   N61311          C1
## 2 SRR1039509 treated   N61311          T1
## 3 SRR1039512 control  N052611          C2
## 4 SRR1039513 treated  N052611          T2
## 5 SRR1039516 control  N080611          C3
## 6 SRR1039517 treated  N080611          T3
## 7 SRR1039520 control  N061011          C4
## 8 SRR1039521 treated  N061011          T4
metadata_subset <- data.frame(row.names = sample_table$sample_name,
                              cell_type = sample_table$celltype,
                              treatment = sample_table$dex)
metadata_subset
##    cell_type treatment
## C1    N61311   control
## T1    N61311   treated
## C2   N052611   control
## T2   N052611   treated
## C3   N080611   control
## T3   N080611   treated
## C4   N061011   control
## T4   N061011   treated
data_to_plot <- cbind(metadata_subset,t(norm_counts_subset))
data_to_plot
##    cell_type treatment ENSG00000152583 ENSG00000179094 ENSG00000116584
## C1    N61311   control        5.923209        7.475356        11.67412
## T1    N61311   treated       11.089208       10.635307        10.56636
## C2   N052611   control        6.832592        7.273795        11.60387
## T2   N052611   treated       10.645621       10.256918        10.42338
## C3   N080611   control        6.677878        7.846876        11.64840
## T3   N080611   treated       10.367184        9.997636        10.50893
## C4   N061011   control        6.470551        7.516538        11.58500
## T4   N061011   treated       10.991606       10.330487        10.54297
##    ENSG00000189221 ENSG00000120129 ENSG00000148175 ENSG00000178695
## C1        9.017501        9.467130        12.84030       12.316632
## T1       12.186827       12.515039        14.32218        9.941118
## C2        8.823067        9.998428        12.86002       11.999167
## T2       12.059141       12.690950        14.00168        9.254766
## C3        9.064100        9.460522        12.92268       12.184040
## T3       12.135343       12.090248        14.19145        9.915834
## C4        8.062936        9.617708        12.91065       12.296746
## T4       11.789618       12.763834        14.37836        9.287920
##    ENSG00000109906 ENSG00000134686 ENSG00000101347
## C1        3.761790        10.78570        10.86883
## T1        9.875987        12.01238        14.42432
## C2        3.780489        10.54446        10.92332
## T2        9.479676        12.00269        14.93875
## C3        2.842242        10.83930        10.63532
## T3        8.804000        12.00246        13.70484
## C4        4.863088        10.50405        11.08533
## T4       10.324809        12.10096        15.13546
# 'melt' the table
library(reshape2)

data_to_plot_melted <- melt(data_to_plot)
## Using cell_type, treatment as id variables
head(data_to_plot_melted)
##   cell_type treatment        variable     value
## 1    N61311   control ENSG00000152583  5.923209
## 2    N61311   treated ENSG00000152583 11.089208
## 3   N052611   control ENSG00000152583  6.832592
## 4   N052611   treated ENSG00000152583 10.645621
## 5   N080611   control ENSG00000152583  6.677878
## 6   N080611   treated ENSG00000152583 10.367184
colnames(data_to_plot_melted) <- c("cell_type","treatment","gene","norm_counts")
head(data_to_plot_melted)
##   cell_type treatment            gene norm_counts
## 1    N61311   control ENSG00000152583    5.923209
## 2    N61311   treated ENSG00000152583   11.089208
## 3   N052611   control ENSG00000152583    6.832592
## 4   N052611   treated ENSG00000152583   10.645621
## 5   N080611   control ENSG00000152583    6.677878
## 6   N080611   treated ENSG00000152583   10.367184
# plot by gene
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_point()

ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_point(aes(color=treatment)) +
  theme(axis.text.x = element_text(angle = 90))

# plot by treatment
ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
  geom_point(aes(color=gene))

ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
  geom_point(aes(color=gene)) +
  stat_summary(fun = mean, geom="line", mapping = aes(group=gene, color=gene))

# final form:
ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
  geom_point(aes(color=gene)) +
  stat_summary(fun = mean, geom="line", mapping = aes(group=gene, color=gene)) +
  labs(title="normalized counts of top 10 most variable genes",
       subtitle="grouped by treatment",
       y="log transformed normalized counts",
       x="dex status")

# adding more stats:
# smaller subset (10 genes is too much)

genes_of_interest <- c("ENSG00000152583",
                       "ENSG00000179094",
                       "ENSG00000116584")

norm_counts_subset <- norm_log_counts[genes_of_interest,]
rownames(norm_counts_subset) <- c("SPARCL1","PER1","ARHGEF2")
metadata_subset <- data.frame(row.names = sample_table$sample_name,
                              cell_type = sample_table$celltype,
                              treatment = sample_table$dex)
data_to_plot <- cbind(metadata_subset,t(norm_counts_subset))
data_to_plot_melted <- melt(data_to_plot)
## Using cell_type, treatment as id variables
colnames(data_to_plot_melted) <- c("cell_type","treatment","gene","norm_counts")
head(data_to_plot_melted)
##   cell_type treatment    gene norm_counts
## 1    N61311   control SPARCL1    5.923209
## 2    N61311   treated SPARCL1   11.089208
## 3   N052611   control SPARCL1    6.832592
## 4   N052611   treated SPARCL1   10.645621
## 5   N080611   control SPARCL1    6.677878
## 6   N080611   treated SPARCL1   10.367184
# plot by gene
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_point(aes(color=treatment))

# boxplot?
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_boxplot(aes(fill=treatment))

# points might be better
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
  geom_point(aes(color=treatment)) +
  stat_summary(fun.min=function(x)(mean(x)-sd(x)), 
               fun.max=function(x)(mean(x)+sd(x)),
               geom="errorbar", aes(group=treatment),
               width=0.1, position=position_dodge(.4)) +
  stat_summary(fun=mean, geom="point", shape=23, color="black", aes(fill=treatment), 
               size=2,position=position_dodge(.4)) 

## Homework for next session:
# install the DESeq2 package (from Bioconductor)